library("DESeq2")
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
##
## rowMedians
## The following objects are masked from 'package:matrixStats':
##
## anyMissing, rowMedians
library("apeglm")
library("ggplot2")
library("vsn")
library("pheatmap")
library("RColorBrewer")
The names of files have been changed by bash:
add suffix “Ductgroup” in the htseqcounts files of Duct group
add suffix “Lobulargroup” in the htseqcounts files of Lobular group
put them together in one folder named Lobular_Ductr to point it to this directory
The folder has been uploaded to Google Drive
directory <- "/Users/margery/Desktop/data/Lobular_Duct/"
#in my computer
#direction<- "~/TRGN510_Final_Project_Data/Lobular_Duct"
sample condition & sampleFilesExtract the information of Condition directly from file names which could guarantee the one-to-one correspondence of expressed matrix and samples
Use grep to select those files containing string group Use sub to chop up the sample filename to obtain the condition status
sampleFiles <- grep("group",list.files(directory),value=TRUE)
sampleCondition <- sub("(.*group).*","\\1",sampleFiles)
sampleTable <- data.frame(sampleName = sampleFiles,
fileName = sampleFiles,
condition = sampleCondition)
sampleTable$condition <- factor(sampleTable$condition)
library("DESeq2")
dds <- DESeqDataSetFromHTSeqCount(sampleTable = sampleTable,
directory = directory,
design= ~ condition)
dds
## class: DESeqDataSet
## dim: 60483 265
## metadata(1): version
## assays(1): counts
## rownames(60483): ENSG00000000003.13 ENSG00000000005.5 ...
## ENSGR0000280767.1 ENSGR0000281849.1
## rowData names(0):
## colnames(265): Ductgroup-TCGA-3C-AALK-01A-11R-A41B-07.htseq.counts.gz
## Ductgroup-TCGA-A2-A04N-01A-11R-A115-07.htseq.counts.gz ...
## Lobulargroup-TCGA-WT-AB44-01A-11R-A41B-07.htseq.counts.gz
## Lobulargroup-TCGA-XX-A89A-01A-11R-A36F-07.htseq.counts.gz
## colData names(1): condition
60483
Remove the genes with few reads (less than 10 ) to reduce the memory size and increase the speed
keep <- rowSums(counts(dds)) >= 10
dds <- dds[keep,]
After filtering, the number of elements has decreased from 60483 to 50860
By default, R will choose a reference level for factors based on alphabetical order
dds$condition
## [1] Ductgroup Ductgroup Ductgroup Ductgroup Ductgroup
## [6] Ductgroup Ductgroup Ductgroup Ductgroup Ductgroup
## [11] Ductgroup Ductgroup Ductgroup Ductgroup Ductgroup
## [16] Ductgroup Ductgroup Ductgroup Ductgroup Ductgroup
## [21] Ductgroup Ductgroup Ductgroup Ductgroup Ductgroup
## [26] Ductgroup Ductgroup Ductgroup Ductgroup Ductgroup
## [31] Ductgroup Ductgroup Ductgroup Ductgroup Ductgroup
## [36] Ductgroup Ductgroup Ductgroup Ductgroup Ductgroup
## [41] Ductgroup Ductgroup Ductgroup Ductgroup Ductgroup
## [46] Ductgroup Ductgroup Ductgroup Ductgroup Ductgroup
## [51] Ductgroup Ductgroup Ductgroup Ductgroup Ductgroup
## [56] Ductgroup Ductgroup Ductgroup Ductgroup Ductgroup
## [61] Ductgroup Ductgroup Ductgroup Ductgroup Ductgroup
## [66] Ductgroup Ductgroup Ductgroup Ductgroup Ductgroup
## [71] Ductgroup Ductgroup Ductgroup Ductgroup Ductgroup
## [76] Ductgroup Ductgroup Ductgroup Ductgroup Ductgroup
## [81] Ductgroup Ductgroup Ductgroup Ductgroup Ductgroup
## [86] Ductgroup Ductgroup Ductgroup Ductgroup Ductgroup
## [91] Ductgroup Ductgroup Ductgroup Ductgroup Ductgroup
## [96] Ductgroup Ductgroup Ductgroup Ductgroup Ductgroup
## [101] Ductgroup Ductgroup Ductgroup Ductgroup Ductgroup
## [106] Ductgroup Ductgroup Ductgroup Ductgroup Ductgroup
## [111] Ductgroup Ductgroup Ductgroup Ductgroup Ductgroup
## [116] Ductgroup Ductgroup Ductgroup Ductgroup Ductgroup
## [121] Ductgroup Ductgroup Ductgroup Ductgroup Ductgroup
## [126] Ductgroup Ductgroup Ductgroup Ductgroup Ductgroup
## [131] Ductgroup Ductgroup Ductgroup Ductgroup Ductgroup
## [136] Lobulargroup Lobulargroup Lobulargroup Lobulargroup Lobulargroup
## [141] Lobulargroup Lobulargroup Lobulargroup Lobulargroup Lobulargroup
## [146] Lobulargroup Lobulargroup Lobulargroup Lobulargroup Lobulargroup
## [151] Lobulargroup Lobulargroup Lobulargroup Lobulargroup Lobulargroup
## [156] Lobulargroup Lobulargroup Lobulargroup Lobulargroup Lobulargroup
## [161] Lobulargroup Lobulargroup Lobulargroup Lobulargroup Lobulargroup
## [166] Lobulargroup Lobulargroup Lobulargroup Lobulargroup Lobulargroup
## [171] Lobulargroup Lobulargroup Lobulargroup Lobulargroup Lobulargroup
## [176] Lobulargroup Lobulargroup Lobulargroup Lobulargroup Lobulargroup
## [181] Lobulargroup Lobulargroup Lobulargroup Lobulargroup Lobulargroup
## [186] Lobulargroup Lobulargroup Lobulargroup Lobulargroup Lobulargroup
## [191] Lobulargroup Lobulargroup Lobulargroup Lobulargroup Lobulargroup
## [196] Lobulargroup Lobulargroup Lobulargroup Lobulargroup Lobulargroup
## [201] Lobulargroup Lobulargroup Lobulargroup Lobulargroup Lobulargroup
## [206] Lobulargroup Lobulargroup Lobulargroup Lobulargroup Lobulargroup
## [211] Lobulargroup Lobulargroup Lobulargroup Lobulargroup Lobulargroup
## [216] Lobulargroup Lobulargroup Lobulargroup Lobulargroup Lobulargroup
## [221] Lobulargroup Lobulargroup Lobulargroup Lobulargroup Lobulargroup
## [226] Lobulargroup Lobulargroup Lobulargroup Lobulargroup Lobulargroup
## [231] Lobulargroup Lobulargroup Lobulargroup Lobulargroup Lobulargroup
## [236] Lobulargroup Lobulargroup Lobulargroup Lobulargroup Lobulargroup
## [241] Lobulargroup Lobulargroup Lobulargroup Lobulargroup Lobulargroup
## [246] Lobulargroup Lobulargroup Lobulargroup Lobulargroup Lobulargroup
## [251] Lobulargroup Lobulargroup Lobulargroup Lobulargroup Lobulargroup
## [256] Lobulargroup Lobulargroup Lobulargroup Lobulargroup Lobulargroup
## [261] Lobulargroup Lobulargroup Lobulargroup Lobulargroup Lobulargroup
## Levels: Ductgroup Lobulargroup
In this case , Ductgroup is the reference , define it manually to make sure
log2 fold change and Wald test p value : last level / reference level log2 fold change : log2 (Lobulargroupt / Ductgroup)
Use function DESeq to do differential expression analysis Use function results to generate results tables with log2 fold changes, p values and adjusted p values.
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 10467 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
res <- results(dds)
res
## log2 fold change (MLE): condition Lobulargroup vs Ductgroup
## Wald test p-value: condition Lobulargroup vs Ductgroup
## DataFrame with 50860 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000000003.13 2994.2734 0.0420056 0.0994397 0.422423 6.72717e-01
## ENSG00000000005.5 53.8553 1.0533486 0.2281769 4.616369 3.90512e-06
## ENSG00000000419.11 2071.6910 -0.3423046 0.0628977 -5.442241 5.26143e-08
## ENSG00000000457.12 2086.8536 0.1151750 0.0637844 1.805693 7.09663e-02
## ENSG00000000460.15 744.0237 -0.1224067 0.0822178 -1.488810 1.36537e-01
## ... ... ... ... ... ...
## ENSG00000281909.1 0.854753 0.4452084 0.2666805 1.6694448 0.095029260
## ENSG00000281910.1 0.422694 -0.0534396 0.5809119 -0.0919927 0.926703856
## ENSG00000281912.1 97.306569 -0.0323205 0.0968381 -0.3337582 0.738561999
## ENSG00000281918.1 2.374675 0.5346585 0.2594000 2.0611354 0.039290120
## ENSG00000281920.1 5.935921 0.7656355 0.1668402 4.5890348 0.000004453
## padj
## <numeric>
## ENSG00000000003.13 7.99464e-01
## ENSG00000000005.5 6.50184e-05
## ENSG00000000419.11 1.81304e-06
## ENSG00000000457.12 1.69797e-01
## ENSG00000000460.15 2.74298e-01
## ... ...
## ENSG00000281909.1 2.10845e-01
## ENSG00000281910.1 9.60304e-01
## ENSG00000281912.1 8.44083e-01
## ENSG00000281918.1 1.08932e-01
## ENSG00000281920.1 7.23534e-05
Export the results to csv file
write.csv(res,file="Lobular_Duct_res.csv")
Use function lfcShrink to shrink the LFC apeglm: (Zhu, Ibrahim, and Love 2018) effect size shrinkage,which improves on the previous estimator
resultsNames(dds)
## [1] "Intercept" "condition_Lobulargroup_vs_Ductgroup"
resLFC <- lfcShrink(dds, coef="condition_Lobulargroup_vs_Ductgroup", type="apeglm")
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
## Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
## sequence count data: removing the noise and preserving large differences.
## Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
resLFC
## log2 fold change (MAP): condition Lobulargroup vs Ductgroup
## Wald test p-value: condition Lobulargroup vs Ductgroup
## DataFrame with 50860 rows and 5 columns
## baseMean log2FoldChange lfcSE pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000000003.13 2994.2734 0.0398904 0.0911447 6.72717e-01 7.99464e-01
## ENSG00000000005.5 53.8553 0.9556161 0.2381667 3.90512e-06 6.50184e-05
## ENSG00000000419.11 2071.6910 -0.3288230 0.0629304 5.26143e-08 1.81304e-06
## ENSG00000000457.12 2086.8536 0.1079616 0.0620041 7.09663e-02 1.69797e-01
## ENSG00000000460.15 744.0237 -0.1095951 0.0785919 1.36537e-01 2.74298e-01
## ... ... ... ... ... ...
## ENSG00000281909.1 0.854753 0.2339201 0.2373185 0.095029260 2.10845e-01
## ENSG00000281910.1 0.422694 -0.0631372 0.2217272 0.926703856 9.60304e-01
## ENSG00000281912.1 97.306569 -0.0273729 0.0890516 0.738561999 8.44083e-01
## ENSG00000281918.1 2.374675 0.3136781 0.2636270 0.039290120 1.08932e-01
## ENSG00000281920.1 5.935921 0.6980221 0.1739988 0.000004453 7.23534e-05
resLFC is more compacted compared to res column stat is removed after shrinking
names(resLFC)
## [1] "baseMean" "log2FoldChange" "lfcSE" "pvalue"
## [5] "padj"
names(res)
## [1] "baseMean" "log2FoldChange" "lfcSE" "stat"
## [5] "pvalue" "padj"
Use parallel=TRUE and BPPARAM=MulticoreParam(4) to split the job over 4 cores
library("BiocParallel")
register(MulticoreParam(4))
order our results table by the smallest p value
resOrdered <- res[order(res$pvalue),]
summary(res)
##
## out of 50840 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 6735, 13%
## LFC < 0 (down) : 6960, 14%
## outliers [1] : 0, 0%
## low counts [2] : 11848, 23%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
Find out How many adjusted p-values were less than 0.05?
sum(res$padj < 0.05, na.rm =T)
## [1] 11017
By default the argument alpha is set to 0.1, set the alpha = 0.05 means set the statistical significance to be 0.05
res05 <- results(dds, alpha=0.05)
summary(res05)
##
## out of 50840 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 5318, 10%
## LFC < 0 (down) : 5751, 11%
## outliers [1] : 0, 0%
## low counts [2] : 12834, 25%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
diff_gene_deseq2 <-subset(res, padj < 0.05 & abs(log2FoldChange) > 1)
dim(diff_gene_deseq2)
## [1] 1490 6
head(diff_gene_deseq2)
## log2 fold change (MLE): condition Lobulargroup vs Ductgroup
## Wald test p-value: condition Lobulargroup vs Ductgroup
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000000005.5 53.8553 1.05335 0.228177 4.61637 3.90512e-06
## ENSG00000001626.13 28.7633 1.19972 0.200847 5.97327 2.32548e-09
## ENSG00000003249.12 2114.7789 -1.00437 0.111452 -9.01169 2.02901e-19
## ENSG00000004799.7 4025.7035 1.03231 0.170746 6.04589 1.48586e-09
## ENSG00000006377.10 12.2752 -2.37282 0.269294 -8.81127 1.23735e-18
## ENSG00000010438.15 15.1358 -1.52541 0.254267 -5.99924 1.98239e-09
## padj
## <numeric>
## ENSG00000000005.5 6.50184e-05
## ENSG00000001626.13 1.38041e-07
## ENSG00000003249.12 1.97889e-16
## ENSG00000004799.7 9.36451e-08
## ENSG00000006377.10 1.04938e-15
## ENSG00000010438.15 1.21028e-07
write.csv(diff_gene_deseq2,file = "DEG_Lobular_Duct.csv")
plotMA shows the log2 fold changes attributable to a given variable over the mean of normalized counts for all the samples in the DESeqDataSet.
plotMA(res, ylim=c(-2,2))
plotMA(resLFC, ylim=c(-2,2))
library(ashr)
resNorm <- lfcShrink(dds, coef=2, type="normal")
## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
##
## Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
## See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
## Reference: https://doi.org/10.1093/bioinformatics/bty895
resAsh <- lfcShrink(dds, coef=2, type="ashr")
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
## Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
## https://doi.org/10.1093/biostatistics/kxw041
par(mfrow=c(1,3), mar=c(4,4,2,1))
xlim <- c(1,1e5); ylim <- c(-3,3)
plotMA(resLFC, xlim=xlim, ylim=ylim, main="apeglm")
plotMA(resNorm, xlim=xlim, ylim=ylim, main="normal")
plotMA(resAsh, xlim=xlim, ylim=ylim, main="ashr")
Examine the counts of reads for a single gene across the groups
Use function plotCounts to normalize counts by the estimated size factors and adds a pseudocount of 1/2 to allow for log scale plotting.
Here we specify the gene which had the smallest p value from the results table created above.
plotCounts(dds, gene=which.min(res$padj), intgroup="condition")
For customized plotting, an argument returnData specifies that the function should only return a data.frame for plotting with ggplot.
d <- plotCounts(dds, gene=which.min(res$padj), intgroup="condition",
returnData=TRUE)
library("ggplot2")
ggplot(d, aes(x=condition, y=count)) +
geom_point(position=position_jitter(w=0.1,h=0)) +
scale_y_log10(breaks=c(25,100,400))
Use function mcols to find which variables and tests were used
mcols(res)$description
## [1] "mean of normalized counts for all samples"
## [2] "log2 fold change (MLE): condition Lobulargroup vs Ductgroup"
## [3] "standard error: condition Lobulargroup vs Ductgroup"
## [4] "Wald statistic: condition Lobulargroup vs Ductgroup"
## [5] "Wald test p-value: condition Lobulargroup vs Ductgroup"
## [6] "BH adjusted p-values"
For a particular gene, a log2 fold change of -1 for condition logroup vs ductgroup means that the logroup induces a multiplicative change in observed gene expression level of 2^-1 = 0.5 compared to the untreated condition.
Use function vsd to remove the dependence of the variance on the mean instead of function rlog for it takes MUCH less time
vsd <- vst(dds, blind=FALSE)
It takes more than 24hrs, so I cut it off
#rld <- rlog(dds, blind=FALSE)
# this gives log2(n + 1)
ntd <- normTransform(dds)
library("vsn")
meanSdPlot(assay(ntd))
meanSdPlot(assay(vsd))
# Data quality assessment by sample clustering and visualization
df <- as.data.frame(sampleCondition)
library("pheatmap")
select <- order(rowMeans(counts(dds,normalized=TRUE)),
decreasing=TRUE)[1:20]
rownames(df) <- colnames(ntd)
pheatmap(assay(ntd)[select,], cluster_rows=FALSE,show_colnames=FALSE,
cluster_cols=FALSE,annotation = df,cellwidth=2)
pheatmap(assay(vsd)[select,], cluster_rows=FALSE,show_colnames=FALSE,
cluster_cols=FALSE,annotation = df,fontsize_row = 6)
normCounts <- counts(dds,normalized = T)
Apply the dist function to the transpose of the transformed count matrix to get sample-to-sample distances.
sampleDists <- dist(t(assay(vsd)))
library("RColorBrewer")
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- paste(vsd$condition, vsd$type, sep="-")
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
clustering_distance_rows=sampleDists,
clustering_distance_cols=sampleDists,
col=colors,show_rownames = F)
plotPCA(vsd, intgroup=c("condition"))
customize the PCA plot using the ggplot function.
pcaData <- plotPCA(vsd, intgroup=c("condition"), returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color=condition, shape=condition)) +
geom_point(size=2) +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance")) +
coord_fixed()